Linear Regressions

I created several linear regression models using temperature as the dependent variable.

Temperature vs. Low Cloud Coverage

# Simple linear regression of temperature on low cloud coverage.
temp_lowc <- lm(temperature ~ cloudlow, data = combine)
temp_lowc %>% tidy() %>% as.data.frame()
##          term    estimate std.error statistic      p.value
## 1 (Intercept) 277.7132586 2.7138125 102.33325 5.691620e-78
## 2    cloudlow   0.6569496 0.1328456   4.94521 5.022951e-06
# Scatter plot with the fitted line. The intercept and slope are read from
# the model itself instead of being hand-copied, so the line always matches
# the fit even if `combine` changes.
lowcg <- ggplot(combine, aes(x = cloudlow, y = temperature)) +
  geom_point() +
  xlab("Low Cloud Coverage") + ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_lowc)[["(Intercept)"]],
    slope = coef(temp_lowc)[["cloudlow"]],
    col = "red"
  )
lowcg

Temperature vs. Middle Cloud Coverage

# Simple linear regression of temperature on middle cloud coverage.
temp_midc <- lm(temperature ~ cloudmid, data = combine)
temp_midc %>% tidy() %>% as.data.frame()
##          term   estimate std.error statistic      p.value
## 1 (Intercept) 313.376097 2.2739709 137.81008 5.642312e-87
## 2    cloudmid  -1.167782 0.1110144 -10.51919 4.670486e-16
# Scatter plot with the fitted line. The intercept and slope are read from
# the model itself instead of being hand-copied, so the line always matches
# the fit even if `combine` changes.
midcg <- ggplot(combine, aes(x = cloudmid, y = temperature)) +
  geom_point() +
  xlab("Middle Cloud Coverage") + ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_midc)[["(Intercept)"]],
    slope = coef(temp_midc)[["cloudmid"]],
    col = "red"
  )
midcg

Temperature vs. High Cloud Coverage

# Simple linear regression of temperature on high cloud coverage.
temp_highc <- lm(temperature ~ cloudhigh, data = combine)
temp_highc %>% tidy() %>% as.data.frame()
##          term    estimate std.error  statistic      p.value
## 1 (Intercept) 298.8933145 1.7554467 170.266243 2.193430e-93
## 2   cloudhigh  -0.8519088 0.1541152  -5.527739 5.221285e-07
# Scatter plot with the fitted line. The intercept and slope are read from
# the model itself instead of being hand-copied, so the line always matches
# the fit even if `combine` changes.
highcg <- ggplot(combine, aes(x = cloudhigh, y = temperature)) +
  geom_point() +
  xlab("High Cloud Coverage") + ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_highc)[["(Intercept)"]],
    slope = coef(temp_highc)[["cloudhigh"]],
    col = "red"
  )
highcg

Temperature vs. Ozone

# Simple linear regression of temperature on ozone level.
temp_ozone <- lm(temperature ~ ozone, data = combine)
temp_ozone %>% tidy() %>% as.data.frame()
##          term    estimate   std.error statistic      p.value
## 1 (Intercept) 337.8292030 12.16953566 27.760238 1.634629e-39
## 2       ozone  -0.1525832  0.03904494 -3.907888 2.124625e-04
# Scatter plot with the fitted line. The intercept and slope are read from
# the model itself instead of being hand-copied, so the line always matches
# the fit even if `combine` changes.
ozoneg <- ggplot(combine, aes(x = ozone, y = temperature)) +
  geom_point() +
  xlab("Ozone Level") + ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_ozone)[["(Intercept)"]],
    slope = coef(temp_ozone)[["ozone"]],
    col = "red"
  )
ozoneg

Temperature vs. Surface Temperature

# Simple linear regression of temperature on surface temperature.
temp_surftemp <- lm(temperature ~ surftemp, data = combine)
temp_surftemp %>% tidy() %>% as.data.frame()
##          term   estimate  std.error statistic      p.value
## 1 (Intercept) 86.0301279 10.9520922  7.855132 3.376424e-11
## 2    surftemp  0.7002365  0.0374963 18.674818 6.705231e-29
# Scatter plot with the fitted line. The intercept and slope are read from
# the model itself instead of being hand-copied, so the line always matches
# the fit even if `combine` changes.
surfg <- ggplot(combine, aes(x = surftemp, y = temperature)) +
  geom_point() +
  xlab("Surface Temperature") + ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_surftemp)[["(Intercept)"]],
    slope = coef(temp_surftemp)[["surftemp"]],
    col = "red"
  )
surfg

Temperature vs. Pressure

# Simple linear regression of temperature on atmospheric pressure.
temp_pres <- lm(temperature ~ pressure, data = combine)
temp_pres %>% tidy() %>% as.data.frame()
##          term     estimate   std.error statistic      p.value
## 1 (Intercept) 259.84483680 20.06505232 12.950120 2.938691e-20
## 2    pressure   0.03408459  0.02234757  1.525203 1.317141e-01
# Scatter plot with the fitted line. The intercept and slope are read from
# the model itself instead of being hand-copied, so the line always matches
# the fit even if `combine` changes.
presg <- ggplot(combine, aes(x = pressure, y = temperature)) +
  geom_point() +
  xlab("Atmospheric Pressure") + ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_pres)[["(Intercept)"]],
    slope = coef(temp_pres)[["pressure"]],
    col = "red"
  )
presg



The six scatter plots are combined into a single figure:

# Lay the six regression plots out on a 3-column by 2-row grid.
figure <- ggarrange(
  plotlist = list(lowcg, midcg, highcg, ozoneg, surfg, presg),
  ncol = 3,
  nrow = 2
)
figure

Regression with Multiple Variables

In the simple linear regressions, pressure was the only variable whose relationship with temperature was not statistically significant (p ≈ 0.13). Therefore, the multiple linear regression model does not use pressure as a predictor.

# Multiple linear regression of temperature on every predictor except
# pressure.
model <- lm(
  temperature ~ cloudlow + cloudmid + cloudhigh + ozone + surftemp,
  data = combine
)
summary(model)
## 
## Call:
## lm(formula = temperature ~ cloudlow + cloudmid + cloudhigh + 
##     ozone + surftemp, data = combine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7001 -1.7232 -0.0064  1.7982  4.8737 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.43522   22.89516   0.893    0.375    
## cloudlow    -0.65061    0.10505  -6.194 4.27e-08 ***
## cloudmid     0.16998    0.10875   1.563    0.123    
## cloudhigh   -0.43951    0.08269  -5.315 1.35e-06 ***
## ozone        0.01669    0.02003   0.833    0.408    
## surftemp     0.95383    0.06855  13.915  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.427 on 66 degrees of freedom
## Multiple R-squared:  0.9261, Adjusted R-squared:  0.9205 
## F-statistic: 165.3 on 5 and 66 DF,  p-value: < 2.2e-16

Predictions

I chose 50 random rows from the NASA data set (the first six are listed below):

# Fix the RNG seed so the same 50 rows are drawn on every run; without it
# the sample, and every prediction derived from it, changes on each run.
set.seed(2020)
temp_pred <- sample_n(dfnasa, 50)
head(temp_pred)
##          lat       long month year cloudhigh cloudlow cloudmid ozone
## 1 -21.200000  -73.73043     5 2000       8.0     48.0     30.5   262
## 2  18.730435  -73.73043     5 1998      11.5     21.5     18.0   274
## 3  33.704348  -81.24348     2 2000       6.5     20.5     19.5   318
## 4  16.234783 -101.27826     8 1999      36.0     18.0     21.5   276
## 5  -6.226087  -73.73043     1 1999      45.5      4.0     30.5   246
## 6   1.260870  -86.25217     9 1998       3.5     54.5     13.0   266
##   pressure surftemp temperature
## 1     1000    290.7       294.1
## 2      995    301.0       301.9
## 3     1000    283.2       288.3
## 4     1000    301.0       303.2
## 5     1000    293.6       301.0
## 6     1000    294.1       298.7

Data frame of 50 random rows from the NASA data set.

# Split the sample into the model's predictor columns and the observed
# temperature, which is kept aside for comparison with the predictions.
model_usage <- select(temp_pred, cloudhigh, cloudlow, cloudmid, ozone, surftemp)
real_temp <- select(temp_pred, temperature)

head(model_usage)
##   cloudhigh cloudlow cloudmid ozone surftemp
## 1       8.0     48.0     30.5   262    290.7
## 2      11.5     21.5     18.0   274    301.0
## 3       6.5     20.5     19.5   318    283.2
## 4      36.0     18.0     21.5   276    301.0
## 5      45.5      4.0     30.5   246    293.6
## 6       3.5     54.5     13.0   266    294.1

The predictor columns in model_usage are used to generate the predictions, while the actual temperatures are stored in real_temp for comparison.

# Append the multiple-regression model's predictions as a `pred` column.
model_predictions <- add_predictions(model_usage, model)

head(model_predictions)
##   cloudhigh cloudlow cloudmid ozone surftemp     pred
## 1       8.0     48.0     30.5   262    290.7 272.5247
## 2      11.5     21.5     18.0   274    301.0 296.1274
## 3       6.5     20.5     19.5   318    283.2 282.9865
## 4      36.0     18.0     21.5   276    301.0 288.2649
## 5      45.5      4.0     30.5   246    293.6 287.1691
## 6       3.5     54.5     13.0   266    294.1 270.6086
head(real_temp)
##   temperature
## 1       294.1
## 2       301.9
## 3       288.3
## 4       303.2
## 5       301.0
## 6       298.7

Graph of Predictions

# Pair each observed temperature with its prediction. The columns are named
# at construction, which replaces the cbind() + colnames() round trip.
actual_preddf <- data.frame(
  real = real_temp$temperature,
  prediction = model_predictions$pred
)

# Interactive scatter plot of predicted against observed temperature. The
# reference line is y = x, so points on it are exact predictions.
ggplotly(
  ggplot(actual_preddf) +
    geom_point(aes(x = real, y = prediction)) +
    geom_abline(intercept = 0, slope = 1, col = "darkturquoise", size = 1) +
    xlab("Real Temperature") + ylab("Predicted Temperature")
)